# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider,Rule
from ..items import NovelCrawlItem


class BookSpider(CrawlSpider):
    """Crawl the Sogou novels listing pages (male/female sections), follow
    pagination links, and scrape each book's detail page into a
    ``NovelCrawlItem``.
    """
    name = 'book'
    allowed_domains = ['xs.sogou.com']
    start_urls = ['https://xs.sogou.com/nansheng/', 'https://xs.sogou.com/nvsheng/']

    # NOTE(review): the original declared TWO rules with the identical
    # LinkExtractor pattern (r'pageNo=\d+'). CrawlSpider de-duplicates
    # extracted links across rules, so the second rule (parse_nvsheng)
    # never fired. Both sections share the same pagination URL pattern,
    # so a single rule with a shared callback is sufficient.
    rules = (
        Rule(LinkExtractor(allow=(r'pageNo=\d+',)),
             callback='parse_nansheng', follow=True),
    )

    def parse_nansheng(self, response):
        """Parse a paginated listing page and yield a Request for every
        book detail page found on it.
        """
        for book in response.xpath('//li[@class="fl clear"]'):
            # Relative ".//" scopes the query to this <li>. The original
            # absolute "//a[...]" ignored the node context and re-matched
            # every cover link on the whole page for each <li>, producing
            # duplicate requests.
            for href in book.xpath('.//a[@class="cover fl"]/@href').extract():
                yield scrapy.Request(url=response.urljoin(href),
                                     callback=self.parse_item,
                                     dont_filter=True)

    # Backward-compatible alias: the old second rule pointed at
    # 'parse_nvsheng', which did exactly the same thing.
    parse_nvsheng = parse_nansheng

    def parse_item(self, response):
        """Extract one book's metadata from its detail page and yield the
        populated ``NovelCrawlItem``.

        Uses ``extract_first(default='')`` and length guards so a page
        with a missing field yields empty strings instead of raising
        IndexError (the original crashed on any incomplete page).
        """
        item = NovelCrawlItem()
        # Book title
        item["book_name"] = response.xpath(
            '//h1[@class="text-title"]/a/text()').extract_first(default='')
        # Each info <span> reads "<label>：<value>"; [3:] strips the
        # 3-character label prefix (e.g. "作者：") — presumably all labels
        # are exactly 3 characters; verify against the live page.
        fields = response.xpath('//div[@class="field clear"]/span/text()').extract()
        # Author name
        item["writer"] = fields[0][3:] if len(fields) > 0 else ''
        # Genre / category
        item["book_type"] = response.xpath(
            '//div[@class="field clear"]/span[2]/a/text()').extract_first(default='')
        # Serialization status
        item["state"] = response.xpath(
            '//div[@class="field clear"]/span[3]/a/text()').extract_first(default='')
        # Word count
        item["word_num"] = fields[3][3:] if len(fields) > 3 else ''
        # Price (item field name "piece" kept as-is: it is part of the
        # NovelCrawlItem schema and renaming would break the pipeline)
        item["piece"] = fields[5][3:] if len(fields) > 5 else ''
        # Short description, with the page's embedded "\r\n " runs removed
        intro = response.xpath(
            '//div[@class="desc desc-short"]/text()').extract_first(default='')
        item["introduce"] = intro.replace('\r\n ', '').strip()
        yield item

